# Worked example: pairwise distances between four points in 4-D space.
a <- c(2, 4, 4, 6)
b <- c(5, 5, 7, 8)
c <- c(9, 9, 9, 8)  # note: shadows the name `c`, but calls to c() still resolve to base::c
d <- c(1, 2, 3, 3)
# Assemble the four vectors as rows of a matrix; row names match rbind(a, b, c, d).
mat <- matrix(c(a, b, c, d), nrow = 4, byrow = TRUE,
              dimnames = list(c("a", "b", "c", "d"), NULL))
# view matrix
mat
# Pairwise distance matrices under three metrics (minkowski defaults to p = 2).
dist(mat, method = "euclidean")
dist(mat, method = "manhattan")
dist(mat, method = "minkowski")
# Generate synthetic matrices with the project helpers (various shapes and
# distributions) and compute uniqueness scores on them.
set.seed(50)
unitib  <- gen_random_matrix('uniform', 1000, 10)
normtib <- gen_random_matrix('normal', 8, 3)
gtib    <- gen_random_matrix('2groups', 20, 3, group_split = .2)
zerotib <- gen_const_matrix(8, 3, 0)
onetib  <- gen_const_matrix(8, 3, 1)
# Uniqueness (distance-based) scores on clustered data.
gtib  <- gen_random_matrix('3groups', 100, 5)
ugtib <- dist_uniq(gtib, 'euclidean')
desc_stats_uniq(ugtib)
gtib  <- gen_random_matrix('2groups', 100, 2)
ugtib <- dist_uniq(gtib, 'euclidean')
print(desc_stats_uniq(ugtib))
# Same on normally-distributed data, trying several distance metrics.
tib   <- gen_random_matrix('normal', 1000, 5)
ugtib <- dist_uniq(tib, 'euclidean')
print(desc_stats_uniq(ugtib))
ugtib <- dist_uniq(tib, 'euclidean')
ugtib <- dist_uniq(tib, 'mahalanobis')
ugtib <- dist_uniq(tib, 'manhattan')
desc_stats_uniq(ugtib)
# Wide (100 x 100) grouped data: the covariance matrix is near-singular,
# hence the very low tolerance passed to mahalanobis().
cov(gen_random_matrix('2groups', 100, 100))
ugtib <- dist_uniq(gen_random_matrix('2groups', 100, 100), 'mahalanobis')
tib <- gen_random_matrix('2groups', 100, 100)
maha_sum_dists <- mahalanobis(tib, colMeans(tib), cov(tib), tol = 1e-20)
# test uniqueness
tib <- unitib
# Disabled block: optional [0, 1] rescaling of the numeric variables before
# the analysis. Use FALSE rather than F: T/F are ordinary variables that can
# be reassigned, while TRUE/FALSE are reserved words.
if (FALSE) {
  # scale variables 0 1
  #tib = tib %>% mutate_if(is.numeric, range01)
}
#cos = tcosine(tib)
#mat = gen_random_matrix('normal', 1000, 100)
#bigdist(as.matrix(mat), 'analysis/dist_matrix.tmp', method='euclidean')
#dist_uniq()
#utib$sum_dists_div = sum_dists / nrow(utib)
#utib$sum_dists_log = log10(utib$sum_dists+1)
#utib$sum_dists_sqrt = sqrt(utib$sum_dists)
#n = 3
#utib$sum_dists_nrt = utib$sum_dists ^ (1 / n)
#utib$uniq_lin = 1/(1+utib$sum_dists)
#utib$uniq_log = 1/(1+utib$sum_dists_log)
#utib$uniq_sqrt = 1/(1+utib$sum_dists_sqrt)
#utib$sum_cos = rowSums(cos)
#utib$sum_cos_pos = utib$sum_cos-min(utib$sum_cos)
#utib$sum_cos_log = log10(utib$sum_cos+1)
#View(round(utib,4))
#summary(utib)
# plot histograms
#tib_long <- utib %>% pivot_longer(colnames(utib)) %>% as_tibble()
#gp1 <- ggplot(tib_long, aes(x = value)) + # Draw each column as histogram
#geom_histogram(bins = 10) +
# geom_density(adjust = 1/2) +
# facet_wrap(~ name, scales = "free_x")
#print(gp1)
# characteristics of uniq to be saved for diagnostic
#x = scale(utib$sum_dists)
#desc_stats(x)
This is similar to outlier detection, in the sense of detection of extreme values, and not of measurement errors. For a single variable, there are well-known methods, e.g., boxplots, n StDev, etc.
“Enderlein (1987) goes even further as the author considers outliers as values that deviate so much from other observations one might suppose a different underlying sampling mechanism. … it sometimes makes sense to formally distinguish two classes of outliers: (i) extreme values and (ii) mistakes.” https://statsandr.com/blog/outliers-detection-in-r
In a multi-variate context, the outliers we are interested in are rare combinations of values. Individually, the values of each variable might not be extreme, but the combination appears relatively far from others.
“The adjusted quantile plot function of the mvoutlier package will solve for and order the squared Mahalanobis Distances for the given observations and plot them against the empirical Chi-Squared distribution function of these values.” https://rpubs.com/Treegonaut/301942
# use mvoutlier: the adjusted quantile plot orders squared Mahalanobis
# distances and compares them with a chi-squared cutoff to flag outliers.
indf <- as.data.frame(gen_random_matrix('2groups', 20, 3))
chi_cutoff <- qchisq(0.975, df = ncol(indf))
outliers <- aq.plot(indf, delta = chi_cutoff)
The Mahalanobis distance is often used for outlier detection (OD) in multivariate data.
“In multivariate space, the Mahalanobis distance is the distance between two points. It’s frequently used to locate outliers in statistical investigations involving several variables.” Source: https://www.r-bloggers.com/2021/08/how-to-calculate-mahalanobis-distance-in-r/
# Draw a 20-row, 4-column sample from the '2groups' generator (project helper);
# the extreme rows visible in the output below act as multivariate outliers.
intib = gen_random_matrix('2groups', 20, 4)
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
intib
## V1 V2 V3 V4
## 1 1.1641803 -0.1948692 -0.4252416 -0.22249308
## 2 424.9662196 266.5979769 -101.5830758 120.47922412
## 3 -1819.0693641 1951.0394591 -140.3411740 216.32643610
## 4 0.1291146 0.1794147 0.7936816 0.55376173
## 5 0.5753677 -0.2868437 0.5837242 -0.43088498
## 6 -0.4859220 0.6387479 -0.9638295 0.06068727
## 7 0.2567009 0.6207820 -0.4172149 0.37015321
## 8 -0.7100911 1.3510034 -0.4412846 0.01016097
## 9 -1.6257058 1.5630141 -0.7880726 -0.95895454
## 10 -1.2711655 1.1788008 0.9085291 1.23311558
## 11 0.1919319 -1.0570429 0.5355723 0.98882670
## 12 -0.4938220 0.4070899 -0.4872650 -0.42588735
## 13 -1.6795307 1.8772010 -0.4288508 0.40217567
## 14 -0.6717679 -0.2808324 -0.2731496 1.64284754
## 15 0.5810927 0.1421171 -0.5369906 -0.23387121
## 16 -0.1177264 -1.8474959 0.6532794 1.04005465
## 17 408.4206719 99.9786286 1757.5022732 2498.60880008
## 18 -488.8363871 1247.0478823 1830.2949026 -25.89555311
## 19 0.4493133 0.5170706 1.1878005 -0.17358978
## 20 0.3674473 0.6878928 0.7081729 -0.78370560
# x: input data
# center: indicate the mean vector of the distribution
# cov: indicate the covariance matrix of the distribution
k <- ncol(intib)  # number of variables, captured BEFORE result columns are added
intib$maha_dist <- mahalanobis(intib, colMeans(intib), cov(intib))
# The p-value is the upper tail of the Chi-Square distribution of the squared
# Mahalanobis distance with k degrees of freedom, where k is the number of
# variables. BUG FIX: the original used df = ncol(intib) evaluated AFTER
# appending maha_dist, which counted the new column and inflated df by one
# (5 instead of 4 here).
intib$maha_pvalue <- pchisq(intib$maha_dist, df = k, lower.tail = FALSE)
#View(intib)
intib$maha_dist
## [1] 0.2356693 18.0496287 18.0499380 0.2372853 0.2383430 0.2367667
## [7] 0.2347773 0.2350168 0.2373395 0.2371455 0.2414191 0.2382379
## [13] 0.2355562 0.2401074 0.2360051 0.2454820 18.0499686 18.0499539
## [19] 0.2357943 0.2355655
intib
## V1 V2 V3 V4 maha_dist maha_pvalue
## 1 1.1641803 -0.1948692 -0.4252416 -0.22249308 0.2356693 0.998681165
## 2 424.9662196 266.5979769 -101.5830758 120.47922412 18.0496287 0.002884845
## 3 -1819.0693641 1951.0394591 -140.3411740 216.32643610 18.0499380 0.002884465
## 4 0.1291146 0.1794147 0.7936816 0.55376173 0.2372853 0.998659209
## 5 0.5753677 -0.2868437 0.5837242 -0.43088498 0.2383430 0.998644726
## 6 -0.4859220 0.6387479 -0.9638295 0.06068727 0.2367667 0.998666277
## 7 0.2567009 0.6207820 -0.4172149 0.37015321 0.2347773 0.998693196
## 8 -0.7100911 1.3510034 -0.4412846 0.01016097 0.2350168 0.998689972
## 9 -1.6257058 1.5630141 -0.7880726 -0.95895454 0.2373395 0.998658469
## 10 -1.2711655 1.1788008 0.9085291 1.23311558 0.2371455 0.998661116
## 11 0.1919319 -1.0570429 0.5355723 0.98882670 0.2414191 0.998602097
## 12 -0.4938220 0.4070899 -0.4872650 -0.42588735 0.2382379 0.998646169
## 13 -1.6795307 1.8772010 -0.4288508 0.40217567 0.2355562 0.998682694
## 14 -0.6717679 -0.2808324 -0.2731496 1.64284754 0.2401074 0.998620367
## 15 0.5810927 0.1421171 -0.5369906 -0.23387121 0.2360051 0.998676620
## 16 -0.1177264 -1.8474959 0.6532794 1.04005465 0.2454820 0.998544636
## 17 408.4206719 99.9786286 1757.5022732 2498.60880008 18.0499686 0.002884428
## 18 -488.8363871 1247.0478823 1830.2949026 -25.89555311 18.0499539 0.002884446
## 19 0.4493133 0.5170706 1.1878005 -0.17358978 0.2357943 0.998679475
## 20 0.3674473 0.6878928 0.7081729 -0.78370560 0.2355655 0.998682569
This shows the behaviour of the sum of distances on clustered data. The visualisation shows the distances of each point to all other points.
# Build a mixture of three Gaussian clusters (project helper) with random
# centres and random sizes.
gauss_mix <- generate_gaussian_mix(
  3,                                                        # number of clusters
  list(sample(1:10, 2), sample(1:10, 2), sample(1:20, 2)),  # centres
  sample(1:40, 3)                                           # sizes
)
# Sum of distances from each point to all other points (row sums of the full
# symmetric distance matrix). TRUE is used instead of the reassignable T.
gauss_mix$dist_sum <- dist(gauss_mix, diag = TRUE, upper = TRUE) %>%
  as.matrix() %>%
  rowSums()
gauss_mix$dist_sum_z <- zscore(gauss_mix$dist_sum)
# plot results
# scatter plot: colour and size encode the raw sum of distances
ggplot(gauss_mix, aes(x = x, y = y)) +
  geom_point(aes(colour = dist_sum, size = log(dist_sum))) +
  scale_colour_gradient() +
  ggtitle('Sum of distances') +
  theme_bw()
# scatter plot: standardised (z score) sum of distances
ggplot(gauss_mix, aes(x = x, y = y)) +
  geom_point(aes(colour = dist_sum_z, size = dist_sum_z)) +
  scale_colour_distiller(palette = 'RdYlBu', direction = 1) +
  ggtitle('Sum of distances (z score)') +
  theme_bw()
# histogram of the distance sums
ggplot(gauss_mix, aes(dist_sum)) + geom_histogram(bins = 20)
Uniqueness can be thought of as \(1 - p\), where p is the probability of encountering a particular observation from random extractions from a set.
For example, these are the percentages of land cover categories in the UK: Farmland 56.7%, Natural 34.9%, Green urban 2.5%, Built on 5.9%. Taking a probabilistic view, we can define the uniqueness of a category as 1-p. The rarest category we would encounter by selecting a random area in the UK is green urban (\(U = .98\)).
Data source: https://www.eea.europa.eu/publications/COR0-landcover
# Uniqueness of each UK land-cover category defined as 1 - p, where p is the
# probability of drawing that category at random.
uk_landcover <- tibble(
  cat = c('farm', 'natural', 'green urban', 'built'),
  pc  = c(56.7, 34.9, 2.5, 5.9)
)
uk_landcover$p    <- uk_landcover$pc / 100
uk_landcover$uniq <- 1 - uk_landcover$p
uk_landcover
## # A tibble: 4 x 4
## cat pc p uniq
## <chr> <dbl> <dbl> <dbl>
## 1 farm 56.7 0.567 0.433
## 2 natural 34.9 0.349 0.651
## 3 green urban 2.5 0.025 0.975
## 4 built 5.9 0.059 0.941
To make it more interpretable, we can use the z score. This way, more unique observations emerge from the data.
# z score of the uniqueness values (zscore() is a project helper); judging by
# the printed `uniq_z[,1]` heading below, it appears to return a one-column
# matrix rather than a plain vector — TODO confirm in uniq_functions.R.
uk_landcover$uniq_z = round(zscore(uk_landcover$uniq),2)
uk_landcover
## # A tibble: 4 x 5
## cat pc p uniq uniq_z[,1]
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 farm 56.7 0.567 0.433 -1.24
## 2 natural 34.9 0.349 0.651 -0.39
## 3 green urban 2.5 0.025 0.975 0.88
## 4 built 5.9 0.059 0.941 0.74
We can think of uniqueness as a deviation from the uniform distribution. If all types of observation occur at the same probability, it is not possible to calculate a meaningful uniqueness score (\(z\) is NA).
# Degenerate case: all categories equally likely, so uniqueness is constant
# and its z score is undefined (zero variance -> NaN, as printed below).
uniform_tib <- tibble(
  cat  = c('cat1', 'cat2', 'cat3', 'cat4'),
  pc   = 1 / 4,
  uniq = 1 - 1 / 4
)
uniform_tib$uniq_z <- zscore(uniform_tib$uniq)
print(uniform_tib)
## # A tibble: 4 x 4
## cat pc uniq uniq_z[,1]
## <chr> <dbl> <dbl> <dbl>
## 1 cat1 0.25 0.75 NaN
## 2 cat2 0.25 0.75 NaN
## 3 cat3 0.25 0.75 NaN
## 4 cat4 0.25 0.75 NaN
We can use measures of entropy to see if the data has heterogeneity that can be used to quantify uniqueness. High entropy means low heterogeneity (no observation will be unique) and vice-versa.
# Entropy of the uniform distribution is maximal (log(4) ~ 1.39 for 4 classes)
print(paste('Entropy of uniform data', round(entropy(uniform_tib$pc),2)))
## [1] "Entropy of uniform data 1.39"
# Lower entropy signals the heterogeneity that uniqueness scores can exploit
print(paste('Entropy of heterogeneous data', round(entropy(uk_landcover$pc),2)))
## [1] "Entropy of heterogeneous data 0.95"
The multivariate case is more interesting and challenging. It is useful for exploratory data analysis (EDA).
In this example, we consider four observations along two dimensions. The geometric interpretation of uniqueness of observation \(a\) in this case is \(u_a = \sum dist(a,b)\). To make the results more interpretable, we can obtain the \(z\) scores. It is possible to observe visually that point 1 is the most spatially isolated, while point 2 is the most central to the dataset and therefore has the lowest uniqueness.
The core function is dist_uniq(...) in the uniq_functions.R.
# Four observations along two dimensions (matrix is filled column-wise:
# V1 = -1, 0, 1, 0 and V2 = -1, 0, 1, 1).
pts <- matrix(c(-1, 0, 1, 0,
                -1, 0, 1, 1), ncol = 2)
multiv_tib <- as_tibble(pts)
print(multiv_tib)
## # A tibble: 4 x 2
## V1 V2
## <dbl> <dbl>
## 1 -1 -1
## 2 0 0
## 3 1 1
## 4 0 1
# Plot the point locations, labelled by row number.
ggplot(multiv_tib, aes(x = V1, y = V2, label = rownames(multiv_tib))) +
  theme(aspect.ratio = 1) +
  geom_point(size = 3, colour = 'blue', fill = 'blue') +
  geom_text(nudge_y = .1) +
  ggtitle('Location of points')
# Full (symmetric, diagonal included) pairwise distance matrix.
dist_mat <- dist(multiv_tib, diag = TRUE, upper = TRUE) %>% as.matrix()
print(dist_mat)
## 1 2 3 4
## 1 0.000000 1.414214 2.828427 2.236068
## 2 1.414214 0.000000 1.414214 1.000000
## 3 2.828427 1.414214 0.000000 1.000000
## 4 2.236068 1.000000 1.000000 0.000000
# sum distances: row sums of the full distance matrix give each point's total
# distance to all other points — the geometric uniqueness score u
multiv_tib$sum_dist = round(rowSums(dist_mat),2)
# z scores make the scores interpretable relative to the dataset
multiv_tib$sum_dist_z = round(zscore(multiv_tib$sum_dist),2)
# plot uniqueness: each point labelled with its raw sum of distances
ggplot(multiv_tib, aes(x=V1, y=V2, label=sum_dist)) + theme(aspect.ratio = 1) +
geom_point(size=3, colour='green', fill='green') + geom_text(nudge_y = .1) +
ggtitle('Sum of distances to other points')
# plot z scores of uniqueness: same layout, standardised labels
ggplot(multiv_tib, aes(x=V1, y=V2, label=sum_dist_z)) + theme(aspect.ratio = 1) +
geom_point(size=3, colour='green', fill='green') + geom_text(nudge_y = .1) +
ggtitle('Sum of distances to other points (z scores)')
On high dimensional data, the interpretation of \(u\) is less immediate. Therefore, it is useful to observe the behaviour of the uniqueness measure \(u\) through a Monte Carlo method.
A core idea is that data with uniformly and normally distributed variables can still produce high values of \(u\), generating false positives in the search for unique observations. The Shapiro-Wilk test on \(u\) consistently returns low W on heterogeneous distributions with actual groups and high W on homogeneous data, regardless of scale.
#t = gen_random_matrix('normal', 10, 5)
#u = calc_uniqueness(t, 'euclidean')
#View(u$uniqueness)
#ggplot(u$uniqueness, aes(x=uniq_dist_avg)) + xlab("uniq_dist_avg") +
# geom_histogram(color="white", fill="lightgreen", bins = 10)#
#ggplot(u$uniqueness, aes(x=uniq)) + #xlab("uniq_dist_avg") +
# geom_histogram(color="white", fill="gray", bins = 10)
if (TRUE) {
  # Experiment 1 (synthetic data): Monte Carlo sweep over data distribution x
  # distance metric x data scale x matrix shape, 10 trials per combination,
  # collecting descriptive stats of the uniqueness scores for each run.
  res <- tibble()
  set.seed(NULL)  # re-randomise the RNG for every execution
  for (rand_data in c('normal','uniform','2groups','3groups'))
    for (dist_method in c('euclidean','manhattan','minkowski','mahalanobis'))
      for (data_scale in c(1,100,1000))
        for (row_n in c(10,100,1000,10000))
          for (col_n in c(2,10,20))
            for (trial in seq(10))
            {
              # Guard against degenerate shapes. Fixes vs original: scalar
              # `||` instead of elementwise `|` in the if() condition, and the
              # `next` keyword rather than the `next()` call form.
              if (row_n == 0 || col_n == 0) next
              #if (dist_method != 'mahalanobis') next
              rand_tib <- gen_random_matrix(rand_data, row_n, col_n, data_scale)
              utib <- dist_uniq(rand_tib, dist_method)
              # One result row per run, tagged with the experiment parameters.
              res_row <- desc_stats_uniq(utib)
              res_row <- res_row %>% add_column(distrib = rand_data,
                                                dist_method = dist_method,
                                                data_scale = data_scale,
                                                row_n = row_n,
                                                col_n = col_n,
                                                sample_n = nrow(utib),
                                                trial = trial,
                                                .before = "n")
              res <- bind_rows(res, res_row)
            }
  saveRDS(res, 'analysis/high_dimensional_experiment_1.rds')
  openxlsx::write.xlsx(res, 'analysis/high_dimensional_experiment_1.xlsx')
}
Summarise experiment 1 results to observe trends in \(u\) in different settings.
# Reload experiment 1 results (one row per parameter combination x trial)
res_tib = readRDS('analysis/high_dimensional_experiment_1.rds')
#%>% #mutate_if(is.character, factor)
print(nrow(res_tib))
## [1] 5760
# Summarise the experiment under several alternative groupings; each pass
# appends one grouped summary (tagged by the `cols` string) to a single long
# tibble, so rows for different groupings coexist with NAs in unused columns.
summary_res = tibble()
for (cols in list(c('row_n'), c('col_n'),
c('distrib','row_n'), c('distrib','col_n'), c('distrib'),
c('dist_method'), c('data_scale'),
c('distrib','dist_method'),
c('distrib','dist_method','data_scale'))){
print(cols)
# human-readable tag identifying the grouping used for these rows
cols_str = paste(cols, collapse = ' ')
res_stats_tib = res_tib %>%
#group_by(distrib, dist_method) %>%
group_by(across(all_of(cols))) %>% # all_of(): group by externally supplied column names
dplyr::summarise(cols = cols_str,
n = n(),
mn_dist_min = mean(min),
mn_dist_max = mean(max),
mn_vcommon = mean(pc_very_common),
mn_common = mean(pc_common),
mn_inbet = mean(pc_inbet),
mn_rare = mean(pc_rare),
mn_vrare = mean(pc_very_rare),
mn_skew = mean(skewness)) %>%
mutate_if(is.numeric, round, 2)
summary_res = bind_rows(summary_res, res_stats_tib)
}
## [1] "row_n"
## [1] "col_n"
## [1] "distrib" "row_n"
## `summarise()` has grouped output by 'distrib'. You can override using the `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
## Column `distrib`
## [1] "distrib" "col_n"
## `summarise()` has grouped output by 'distrib'. You can override using the `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
## Column `distrib`
## [1] "distrib"
## [1] "dist_method"
## [1] "data_scale"
## [1] "distrib" "dist_method"
## `summarise()` has grouped output by 'distrib'. You can override using the `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
## Column `distrib`
## [1] "distrib" "dist_method" "data_scale"
## `summarise()` has grouped output by 'distrib', 'dist_method'. You can override using the `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
## Columns `distrib`, `dist_method`
summary_res
## # A tibble: 110 x 15
## row_n cols n mn_dist_min mn_dist_max mn_vcommon mn_common mn_inbet
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 10 row_n 1440 -1.03 2.01 87.9 3.38 5.68
## 2 100 row_n 1440 -1.16 3.37 88.3 3.17 4.77
## 3 1000 row_n 1440 -1.28 4.58 88.6 3.2 4.41
## 4 10000 row_n 1440 -1.28 4.96 88.7 3.19 4.38
## 5 NA col_n 1920 -0.85 4.77 89.0 3.05 4.31
## 6 NA col_n 1920 -1.32 3.45 88.2 3.32 4.71
## 7 NA col_n 1920 -1.39 2.98 87.8 3.33 5.41
## 8 10 distrib ro… 360 -0.68 2.09 83.6 2.64 11.2
## 9 100 distrib ro… 360 -0.46 3.38 84.2 2.89 8.61
## 10 1000 distrib ro… 360 -0.46 4.72 84.8 3.23 7.24
## # … with 100 more rows, and 7 more variables: mn_rare <dbl>, mn_vrare <dbl>,
## # mn_skew <dbl>, col_n <dbl>, distrib <chr>, dist_method <chr>,
## # data_scale <dbl>
Impact of variables on distribution of uniqueness values.
Different distributions generate different common/rare patterns, with 1,440 cases observed for each combination. Uniform and normally-distributed data generate a similar pattern, with normal data having slightly more rare/very rare observations (2%, 1.1%). Uneven distributions with groups, as expected by their design, generate more rare/very rare observations (4.1% and 7.3%), having a large central group of close observations and relatively many distant observations:
Using Dunn’s test, the only two distributions that are not significantly different are the normal and the uniform, which generate similar uniqueness patterns (TODO: why?).
# impact of data distribution
#summary_res %>% filter(cols == 'distrib') %>% dplyr::select(distrib, n, mn_dist_min, mn_dist_max, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% flextable()
summary_results_tib = summary_res
# compare groups by data distribution (rows where cols == 'distrib';
# the original comment incorrectly said "by number of rows")
tib = summary_results_tib %>% filter(cols == 'distrib') %>%
dplyr::select(distrib, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% unique()
tib %>% flextable()
distrib | n | mn_vcommon | mn_common | mn_inbet | mn_rare | mn_vrare |
2groups | 1,440 | 84.39 | 2.99 | 8.54 | 3.16 | 0.92 |
3groups | 1,440 | 90.68 | 0.54 | 1.66 | 4.50 | 2.63 |
normal | 1,440 | 89.07 | 4.47 | 4.47 | 1.45 | 0.54 |
uniform | 1,440 | 89.35 | 4.94 | 4.57 | 1.06 | 0.08 |
# Reconstruct categorical samples from the per-distribution percentages and
# compare the groups with a non-parametric Dunn's test.
all_vals <- tibble()
u_labels <- c('vcommon','common','inbet','rare','vrare')
for (grp in tib$distrib) {
  grp_row <- tib %>% filter(distrib == grp)
  pcs <- grp_row %>%
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>%
    t() %>%
    as.vector()
  sz <- grp_row %>% dplyr::select(n) %>% .[[1]]
  vals <- generate_cat_data_from_pc(pcs, u_labels, sz)
  all_vals <- bind_rows(all_vals, tibble(group = grp, val = vals))
}
# Dunn's test to check if the groups are different (non-parametric)
res_dunn <- all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 6 x 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 val 2groups 3groups 1438 1436 8.14 3.88e-16 2.33e-15 ****
## 2 val 2groups normal 1438 1437 3.03 2.44e- 3 7.33e- 3 **
## 3 val 2groups uniform 1438 1438 2.50 1.25e- 2 2.50e- 2 *
## 4 val 3groups normal 1436 1437 -5.11 3.19e- 7 1.28e- 6 ****
## 5 val 3groups uniform 1436 1438 -5.65 1.65e- 8 8.23e- 8 ****
## 6 val normal uniform 1437 1438 -0.533 5.94e- 1 5.94e- 1 ns
The size of the dataset (n observations x n variables) has minimal impact on the behaviour of \(u\). Varying the number of observations from 10 to 10,000, it is possible to observe that with very few observations (10) it is less likely to obtain very rare observations, as expected. The other three rows converge and do not have significant differences (using Dunn’s test):
Rows:
# Compare uniqueness-category proportions across dataset sizes (row_n),
# again via reconstructed categorical samples and Dunn's test.
tib <- summary_results_tib %>%
  filter(cols == 'row_n') %>%
  dplyr::select(row_n, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare)
all_vals <- tibble()
u_labels <- c('vcommon','common','inbet','rare','vrare')
for (grp in tib$row_n) {
  sel <- tib %>% filter(row_n == grp)
  pcs <- sel %>%
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>%
    t() %>%
    as.vector()
  sz <- sel %>% dplyr::select(n) %>% .[[1]]
  vals <- generate_cat_data_from_pc(pcs, u_labels, sz)
  all_vals <- bind_rows(all_vals, tibble(group = grp, val = vals))
}
# Dunn's test to check if the groups are different (non-parametric)
res_dunn <- all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 6 x 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 val 10 100 1438 1438 2.34 0.0190 0.0761 ns
## 2 val 10 1000 1438 1439 2.68 0.00736 0.0368 *
## 3 val 10 10000 1438 1437 2.78 0.00537 0.0322 *
## 4 val 100 1000 1438 1439 0.335 0.738 1 ns
## 5 val 100 10000 1438 1437 0.439 0.660 1 ns
## 6 val 1000 10000 1439 1437 0.105 0.916 1 ns
Columns do not have impact on the results:
# Compare uniqueness-category proportions across numbers of columns (col_n).
tib <- summary_results_tib %>%
  filter(cols == 'col_n') %>%
  dplyr::select(col_n, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare)
all_vals <- tibble()
u_labels <- c('vcommon','common','inbet','rare','vrare')
for (grp in tib$col_n) {
  print(grp)
  sel <- tib %>% filter(col_n == grp)
  pcs <- sel %>%
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>%
    t() %>%
    as.vector()
  sz <- sel %>% dplyr::select(n) %>% .[[1]]
  print(paste('generating synth data:', sz))
  vals <- generate_cat_data_from_pc(pcs, u_labels, sz)
  all_vals <- bind_rows(all_vals, tibble(group = grp, val = vals))
}
## [1] 2
## [1] "generating synth data: 1920"
## [1] 10
## [1] "generating synth data: 1920"
## [1] 20
## [1] "generating synth data: 1920"
# Dunn's test to check if the column-count groups are different (non-parametric)
res_dunn = all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 3 x 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 val 2 10 1917 1917 -1.49 0.137 0.274 ns
## 2 val 2 20 1917 1916 -2.23 0.0260 0.0779 ns
## 3 val 10 20 1917 1916 -0.740 0.460 0.460 ns
The four distance methods do not generate significantly different proportions of \(u\) results.
# Compare uniqueness-category proportions across the four distance metrics.
tib <- summary_results_tib %>%
  filter(cols == 'dist_method') %>%
  dplyr::select(dist_method, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>%
  unique()
all_vals <- tibble()
u_labels <- c('vcommon','common','inbet','rare','vrare')
for (grp in tib$dist_method) {
  sel <- tib %>% filter(dist_method == grp)
  pcs <- sel %>%
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>%
    t() %>%
    as.vector()
  sz <- sel %>% dplyr::select(n) %>% .[[1]]
  vals <- generate_cat_data_from_pc(pcs, u_labels, sz)
  all_vals <- bind_rows(all_vals, tibble(group = grp, val = vals))
}
# Dunn's test to check if the groups are different (non-parametric)
res_dunn <- all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 6 x 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 val euclidean mahalanobis 1437 1437 1.01 0.313 1 ns
## 2 val euclidean manhattan 1437 1437 -0.111 0.912 1 ns
## 3 val euclidean minkowski 1437 1437 -0.128 0.898 1 ns
## 4 val mahalanobis manhattan 1437 1437 -1.12 0.263 1 ns
## 5 val mahalanobis minkowski 1437 1437 -1.14 0.256 1 ns
## 6 val manhattan minkowski 1437 1437 -0.0174 0.986 1 ns
Data scale doesn’t make any difference to the results either.
# Compare uniqueness-category proportions across data scales.
tib <- summary_results_tib %>%
  filter(cols == 'data_scale') %>%
  dplyr::select(data_scale, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>%
  unique()
all_vals <- tibble()
u_labels <- c('vcommon','common','inbet','rare','vrare')
for (grp in tib$data_scale) {
  sel <- tib %>% filter(data_scale == grp)
  pcs <- sel %>%
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>%
    t() %>%
    as.vector()
  sz <- sel %>% dplyr::select(n) %>% .[[1]]
  vals <- generate_cat_data_from_pc(pcs, u_labels, sz)
  all_vals <- bind_rows(all_vals, tibble(group = grp, val = vals))
}
# Dunn's test to check if the groups are different (non-parametric)
res_dunn <- all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 3 x 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 val 1 100 1918 1917 0.248 0.805 1 ns
## 2 val 1 1000 1918 1918 0.297 0.766 1 ns
## 3 val 100 1000 1917 1918 0.0499 0.960 1 ns
TODO
Comparing boroughs of Greater London w.r.t. socio-economic variables.
# Read Greater London borough boundaries with 2015 profile attributes (sf)
lnd_boro = st_read('data/london/london_borough_profiles_2015/london_boroughs_profiles_2015.geojson')
## Reading layer `london_boroughs_profiles_2015' from data source `/Users/andreaballatore/Dropbox/DRBX_Docs/Work/Projects/github_projects/calculating-uniqueness/data/london/london_borough_profiles_2015/london_boroughs_profiles_2015.geojson' using driver `GeoJSON'
## Simple feature collection with 33 features and 89 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -0.5103751 ymin: 51.28676 xmax: 0.3340156 ymax: 51.69187
## geographic CRS: WGS 84
# select variables: first list all attribute columns before narrowing down
print(names(lnd_boro))
## [1] "fid"
## [2] "gss"
## [3] "name"
## [4] "hectares"
## [5] "nonld_area"
## [6] "ons_inner"
## [7] "area_name"
## [8] "inner_outer_london"
## [9] "gla_population_estimate_2017"
## [10] "gla_household_estimate_2017"
## [11] "inland_area_hectares"
## [12] "population_density_per_hectare_2017"
## [13] "average_age_2017"
## [14] "proportion_of_population_aged_0_15_2015"
## [15] "proportion_of_population_of_working_age_2015"
## [16] "proportion_of_population_aged_65_and_over_2015"
## [17] "net_internal_migration_2015"
## [18] "net_international_migration_2015"
## [19] "net_natural_change_2015"
## [20] "pc_of_resident_population_born_abroad_2015"
## [21] "largest_migrant_population_by_country_of_birth_2011"
## [22] "pc_of_largest_migrant_population_2011"
## [23] "second_largest_migrant_population_by_country_of_birth_2011"
## [24] "pc_of_second_largest_migrant_population_2011"
## [25] "third_largest_migrant_population_by_country_of_birth_2011"
## [26] "pc_of_third_largest_migrant_population_2011"
## [27] "pc_of_population_from_bame_groups_2016"
## [28] "pc_people_aged_3_whose_main_language_is_not_english_2011_census"
## [29] "overseas_nationals_entering_the_uk_nino_2015_16"
## [30] "new_migrant_nino_rates_2015_16"
## [31] "largest_migrant_population_arrived_during_2015_16"
## [32] "second_largest_migrant_population_arrived_during_2015_16"
## [33] "third_largest_migrant_population_arrived_during_2015_16"
## [34] "employment_rate_pc_2015"
## [35] "male_employment_rate_2015"
## [36] "female_employment_rate_2015"
## [37] "unemployment_rate_2015"
## [38] "youth_unemployment_claimant_rate_18_24_dec_15"
## [39] "proportion_of_16_18_year_olds_who_are_neet_pc_2014"
## [40] "proportion_of_the_working_age_population_who_claim_out_of_work_benefits_pc_may_2016"
## [41] "pc_working_age_with_a_disability_2015"
## [42] "proportion_of_working_age_people_with_no_qualifications_pc_2015"
## [43] "proportion_of_working_age_with_degree_or_equivalent_and_above_pc_2015"
## [44] "gross_annual_pay_2016"
## [45] "gross_annual_pay_male_2016"
## [46] "gross_annual_pay_female_2016"
## [47] "modelled_household_median_income_estimates_2012_13"
## [48] "pc_adults_that_volunteered_in_past_12_months_2010_11_to_2012_13"
## [49] "number_of_jobs_by_workplace_2014"
## [50] "pc_of_employment_that_is_in_public_sector_2014"
## [51] "jobs_density_2015"
## [52] "number_of_active_businesses_2015"
## [53] "two_year_business_survival_rates_started_in_2013"
## [54] "crime_rates_per_thousand_population_2014_15"
## [55] "fires_per_thousand_population_2014"
## [56] "ambulance_incidents_per_hundred_population_2014"
## [57] "median_house_price_2015"
## [58] "average_band_d_council_tax_charge__2015_16"
## [59] "new_homes_net_2015_16_provisional"
## [60] "homes_owned_outright_2014_pc"
## [61] "being_bought_with_mortgage_or_loan_2014_pc"
## [62] "rented_from_local_authority_or_housing_association_2014_pc"
## [63] "rented_from_private_landlord_2014_pc"
## [64] "pc_of_area_that_is_greenspace_2005"
## [65] "total_carbon_emissions_2014"
## [66] "household_waste_recycling_rate_2014_15"
## [67] "number_of_cars_2011_census"
## [68] "number_of_cars_per_household_2011_census"
## [69] "pc_of_adults_who_cycle_at_least_once_per_month_2014_15"
## [70] "average_public_transport_accessibility_score_2014"
## [71] "achievement_of_5_or_more_a_c_grades_at_gcse_or_equivalent_including_english_and_maths_2013_14"
## [72] "rates_of_children_looked_after_2016"
## [73] "pc_of_pupils_whose_first_language_is_not_english_2015"
## [74] "pc_children_living_in_out_of_work_households_2015"
## [75] "male_life_expectancy_2012_14"
## [76] "female_life_expectancy_2012_14"
## [77] "teenage_conception_rate_2014"
## [78] "life_satisfaction_score_2011_14_out_of_10"
## [79] "worthwhileness_score_2011_14_out_of_10"
## [80] "happiness_score_2011_14_out_of_10"
## [81] "anxiety_score_2011_14_out_of_10"
## [82] "childhood_obesity_prevalance_pc_2015_16"
## [83] "people_aged_17_with_diabetes_pc"
## [84] "mortality_rate_from_causes_considered_preventable_2012_14"
## [85] "political_control_in_council"
## [86] "proportion_of_seats_won_by_conservatives_in_2014_election"
## [87] "proportion_of_seats_won_by_labour_in_2014_election"
## [88] "proportion_of_seats_won_by_lib_dems_in_2014_election"
## [89] "turnout_at_2014_local_elections"
## [90] "geometry"
# Keep a small, interpretable subset of indicators and give them short names.
lnd_boro <- lnd_boro %>%
  select(gss, name,
         population_density_per_hectare_2017,
         average_age_2017,
         pc_of_resident_population_born_abroad_2015,
         employment_rate_pc_2015,
         modelled_household_median_income_estimates_2012_13,
         proportion_of_seats_won_by_conservatives_in_2014_election,
         proportion_of_seats_won_by_labour_in_2014_election,
         median_house_price_2015) %>%
  rename(id = gss,
         pop_dens = population_density_per_hectare_2017,
         average_age = average_age_2017,
         pop_born_abroad = pc_of_resident_population_born_abroad_2015,
         employment = employment_rate_pc_2015,
         household_income = modelled_household_median_income_estimates_2012_13,
         conservative_seats = proportion_of_seats_won_by_conservatives_in_2014_election,
         labour_seats = proportion_of_seats_won_by_labour_in_2014_election,
         house_price = median_house_price_2015)
# Quick-look maps of all retained variables, without borough borders.
plot(lnd_boro %>% select(-id, -name), border = NA)
# integrate missing data
# from https://data.gov.uk/dataset/248f5f04-23cf-4470-9216-0d0be9b877a8/london-borough-profiles-and-atlas/datafile/1e5d5226-a1a7-4097-afbf-d3bd39226c39/preview
#lnd_boro[lnd_boro$name == "Kensington and Chelsea", "gross_annual_pay_2016"] = 63620
Identify problematic variables (high correlation or strong skewness).
# get correlations
# Kendall's tau (rank-based) is robust to the skewed distributions seen below
lnd_boro_corr = lnd_boro %>% st_drop_geometry() %>% select(-name,-id) %>% correlate(method='kendall') %>% stretch()
##
## Correlation method: 'kendall'
## Missing treated using: 'pairwise.complete.obs'
# observe high correlations
# pairs with |tau| > 0.5 are candidates for removal; each pair appears twice (x/y swapped)
lnd_boro_corr %>% filter(r > .5 | r < -.5)
## # A tibble: 10 x 3
## x y r
## <chr> <chr> <dbl>
## 1 pop_dens average_age -0.514
## 2 average_age pop_dens -0.514
## 3 average_age conservative_seats 0.586
## 4 average_age labour_seats -0.627
## 5 household_income conservative_seats 0.501
## 6 conservative_seats average_age 0.586
## 7 conservative_seats household_income 0.501
## 8 conservative_seats labour_seats -0.697
## 9 labour_seats average_age -0.627
## 10 labour_seats conservative_seats -0.697
# remove Labour variable
# labour_seats correlates strongly with conservative_seats (tau = -0.697)
# and with average_age, so it adds little independent information
lnd_boro = lnd_boro %>% select(-labour_seats)
# descriptive statistics per variable; norm=TRUE adds skewness, kurtosis and
# a normality test (TRUE spelled out instead of the reassignable shorthand T)
lnd_boro_desc_stats = t(stat.desc(lnd_boro %>% st_drop_geometry() %>% select(-id,-name), norm=TRUE))
lnd_boro_desc_stats
## nbr.val nbr.null nbr.na min max
## pop_dens 33 0 0 21.84036 155.6205
## average_age 33 0 0 31.40000 43.2000
## pop_born_abroad 32 0 1 10.90000 54.1000
## employment 33 0 0 64.60000 79.6000
## household_income 33 0 0 28780.00000 63620.0000
## conservative_seats 32 5 1 0.00000 85.0000
## house_price 33 0 0 243500.00000 1200000.0000
## range sum median mean
## pop_dens 133.7801 2457.754 59.17539 74.47740
## average_age 11.8000 1200.400 36.20000 36.37576
## pop_born_abroad 43.2000 1168.400 36.90000 36.51250
## employment 15.0000 2399.600 73.10000 72.71515
## household_income 34840.0000 1308190.000 37040.00000 39642.12121
## conservative_seats 85.0000 1046.898 30.00000 32.71556
## house_price 956500.0000 15360443.000 410000.00000 465467.96970
## SE.mean CI.mean.0.95 var std.dev
## pop_dens 6.856818e+00 13.966880 1.551526e+03 3.938942e+01
## average_age 4.330790e-01 0.882153 6.189394e+00 2.487849e+00
## pop_born_abroad 1.855380e+00 3.784072 1.101579e+02 1.049561e+01
## employment 7.345005e-01 1.496128 1.780320e+01 4.219384e+00
## household_income 1.287259e+03 2622.061541 5.468221e+07 7.394742e+03
## conservative_seats 4.776588e+00 9.741915 7.301052e+02 2.702046e+01
## house_price 3.557386e+04 72461.579214 4.176148e+10 2.043563e+05
## coef.var skewness skew.2SE kurtosis kurt.2SE
## pop_dens 0.52887744 0.5506993 0.6738271 -0.9455118 -0.59211886
## average_age 0.06839306 0.4105474 0.5023395 0.2371737 0.14852807
## pop_born_abroad 0.28745261 -0.4072860 -0.4913485 -0.1454105 -0.08982929
## employment 0.05802620 -0.2030418 -0.2484388 -1.0326806 -0.64670761
## household_income 0.18653750 1.3316678 1.6294081 1.8164895 1.13756141
## conservative_seats 0.82592087 0.3562270 0.4297511 -1.2819967 -0.79197072
## house_price 0.43903399 1.8238259 2.2316052 3.3048996 2.06966585
## normtest.W normtest.p
## pop_dens 0.9229907 2.219048e-02
## average_age 0.9747445 6.211098e-01
## pop_born_abroad 0.9603938 2.815670e-01
## employment 0.9631147 3.158213e-01
## household_income 0.8804765 1.723490e-03
## conservative_seats 0.9128969 1.340429e-02
## house_price 0.7935325 2.435519e-05
# histograms to see normality
# pivot_longer() replaces the superseded gather(); one facet per variable
ggplot(pivot_longer(lnd_boro %>% st_drop_geometry() %>% select(-name,-id), everything()), aes(value)) +
  geom_histogram(bins = 20) +
  facet_wrap(~name, scales = 'free_x')
## Warning: Removed 2 rows containing non-finite values (stat_bin).
The variables whose skewness is a concern are median_house_price_2015 and household_income. Both will be normalised with log10.
# normalise and scale
# log10(x+1) compresses the heavy right tails; the +1 offset guards against log10(0)
lnd_boro$house_price = log10(lnd_boro$house_price+1)
lnd_boro$household_income = log10(lnd_boro$household_income+1)
# skewness is now acceptable
# histograms to see normality (after the log10 transformation)
# pivot_longer() replaces the superseded gather(); one facet per variable
ggplot(pivot_longer(lnd_boro %>% st_drop_geometry() %>% select(-name,-id), everything()), aes(value)) +
  geom_histogram(bins = 20) +
  facet_wrap(~name, scales = 'free_x')
## Warning: Removed 2 rows containing non-finite values (stat_bin).
Multidimensional scaling of boroughs.
# classical MDS: scale numeric variables, compute pairwise distances,
# project to 2 dimensions; across(where(...)) replaces superseded mutate_if()
mds <- lnd_boro %>% select(-name,-id) %>% st_drop_geometry() %>%
  mutate(across(where(is.numeric), scale)) %>% dist() %>% cmdscale() %>% as_tibble()
colnames(mds) <- c("Dim1", "Dim2")
# row order is preserved, so borough names can be attached directly
mds$name = lnd_boro$name
# plot MDS: each borough as a small point with a text label
ggplot(mds, aes(x=Dim1, y=Dim2)) +
  ggtitle("Multi-dimensional scaling of Greater London boroughs") +
  geom_point(size=.2) +
  geom_text(
    label=mds$name,
    size=2,
    #nudge_x = .25,
    nudge_y = .1,
    check_overlap = FALSE  # FALSE spelled out instead of reassignable F
  ) + theme_bw()
# free the intermediate MDS tibble
rm(mds)
Calculate uniqueness of boroughs based on all variables.
# available metrics: 'euclidean', 'manhattan', 'minkowski', 'mahalanobis'
# filter out very rare
#lnd_boro = lnd_boro %>% filter(!(id %in% c('E09000001','E09000020')))
# distance metric used for the uniqueness scores below
method = 'euclidean'
# compute per-borough uniqueness scores (sum_dists* columns — see dist_uniq)
lnd_boro_uniq = dist_uniq(lnd_boro %>% st_drop_geometry(), method)
## [1] "dist_uniq 33 9 euclidean"
## [1] "scaling data..."
#lnd_boro_u_man = dist_uniq(lnd_boro %>% st_drop_geometry(), 'manhattan')
#lnd_boro_u_mink = dist_uniq(lnd_boro %>% st_drop_geometry(), 'minkowski')
#lnd_boro_u_maha = dist_uniq(lnd_boro %>% st_drop_geometry(), 'mahalanobis', impute_na = T)
# export rounded results; the file name records the metric used
openxlsx::write.xlsx(lnd_boro_uniq %>% mutate_if(is.numeric, round, 4),
paste0('analysis/london_boroughs_uniqueness-',method,'.xlsx'))
# bar chart
#lnd_boro_uniq %>% select(sum_dists_class) %>% plot()
# add geometries back (merge on the shared id column)
lnd_boro_uniq_geo = merge(lnd_boro %>% select(id), lnd_boro_uniq)
# maps
plot_uniq_map(lnd_boro_uniq_geo)
## tmap mode set to plotting
## Variable(s) "sum_dists_z" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
# heatmaps
gen_variable_heatmap(lnd_boro_uniq_geo, T, 50)
## # A tibble: 297 x 3
## row_id col_name value[,1]
## <chr> <chr> <dbl>
## 1 City of London pop_dens -1.12
## 2 City of London average_age 2.74
## 3 City of London pop_born_abroad NA
## 4 City of London employment -1.92
## 5 City of London household_income 2.84
## 6 City of London conservative_seats NA
## 7 City of London house_price 1.69
## 8 City of London sum_dists_z 3.43
## 9 City of London sum_dists_z_p 0.000297
## 10 Kensington and Chelsea pop_dens 1.44
## # … with 287 more rows
## Warning: Removed 2 rows containing missing values (geom_text).
gen_variable_heatmap(lnd_boro_uniq_geo, F, 50)
## # A tibble: 297 x 3
## row_id col_name value[,1]
## <chr> <chr> <dbl>
## 1 Waltham Forest pop_dens -0.0839
## 2 Waltham Forest average_age -0.513
## 3 Waltham Forest pop_born_abroad 0.0655
## 4 Waltham Forest employment 0.0912
## 5 Waltham Forest household_income -0.964
## 6 Waltham Forest conservative_seats -0.224
## 7 Waltham Forest house_price -0.463
## 8 Waltham Forest sum_dists_z -1.03
## 9 Waltham Forest sum_dists_z_p 0.849
## 10 Greenwich pop_dens -0.388
## # … with 287 more rows
## Warning: Removed 2 rows containing missing values (geom_text).
# choropleth
# Based on <https://mharinga.github.io/choropleth.html> (accessed in May 2022).
#u_colors = rev(toupper(c('#f0f9e8','#bae4bc','#7bccc4','#43a2ca','#0868ac')))
#lnd_boro_u_geo %>% select(sum_dists_class) %>% plot(border = 'lightgray',
# pal = u_colors,
# main='Uniqueness of London boroughs')
Calculate uniqueness of countries based on World Bank indicators (2019).
Select countries with at least 1 million people:
# load 2019 World Bank indicators with country geometries; keep rows with a
# valid ISO3 code and at least 1 million people; iso_a3 == '-99' marks
# territories without a proper code in the boundary data — TODO confirm source
wb_geo = read_sf("data/world_bank/indicators_by_year/world_bank_indicators_countries-2019.geojson") %>% filter(!is.na(iso3c) & sp.pop.totl >= 1e6 & iso_a3 != '-99')
names(wb_geo)
## [1] "iso_a2" "iso_a3" "name"
## [4] "name_long" "abbrev" "type"
## [7] "continent" "region_un" "subregion"
## [10] "region_wb" "un_a3" "wb_a2"
## [13] "wb_a3" "iso3c" "country"
## [16] "date" "it.net.user.zs" "ms.mil.xpnd.gd.zs"
## [19] "ne.trd.gnfs.zs" "ny.gdp.mktp.cd" "ny.gdp.mktp.pp.cd"
## [22] "sl.isv.ifrm.zs" "sp.pop.grow" "sp.pop.totl"
## [25] "sp.rur.totl.zs" "sp.urb.totl.in.zs" "region"
## [28] "income_group" "eu" "geometry"
nrow(wb_geo)
## [1] 157
# country codes are used as keys later, so they must be unique
stopifnot(is_unique(wb_geo$iso_a3))
stopifnot(is_unique(wb_geo$iso_a2))
# use Winkel Tripel for visualisations
crs_wintri <- "+proj=wintri +datum=WGS84 +no_defs +over"
wb_geo <- st_transform(wb_geo, crs = crs_wintri)
Select and rename variables.
# Rename World Bank indicator codes to readable names, derive per-capita GDP
# (gdp is ny.gdp.mktp.pp.cd, i.e. PPP — TODO confirm units), and keep the
# variables used in the uniqueness analysis.
wb_tib = wb_geo %>% st_drop_geometry() %>% rename(
population = sp.pop.totl,
pop_growth = sp.pop.grow,
urban_pop = sp.urb.totl.in.zs,
gdp = ny.gdp.mktp.pp.cd,
gdp_trade = ne.trd.gnfs.zs,
internet_access = it.net.user.zs,
military_exp = ms.mil.xpnd.gd.zs
) %>% mutate(pc_gdp = gdp / population) %>%
select(iso_a2, iso_a3, name, region_un, population, pop_growth, gdp, pc_gdp,
urban_pop, internet_access, military_exp) # income_group,
# (removed redundant self-assignment `wb_tib = wb_tib` — a no-op)
wb_tib %>% sample_n(10)
## # A tibble: 10 x 11
## iso_a2 iso_a3 name region_un population pop_growth gdp pc_gdp urban_pop
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 KG KGZ Kyrg… Asia 6456900 2.10 3.54e10 5486. 36.6
## 2 IE IRL Irel… Europe 4941444 1.51 4.36e11 88241. 63.4
## 3 SE SWE Swed… Europe 10285453 1.08 5.74e11 55820. 87.7
## 4 ZM ZMB Zamb… Africa 17861030 2.89 6.47e10 3624. 44.1
## 5 SK SVK Slov… Europe 5454073 0.134 1.86e11 34067. 53.7
## 6 LK LKA Sri … Asia 21803000 0.612 2.98e11 13657. 18.6
## 7 VE VEN Vene… Americas 28515829 -1.24 NA NA 88.2
## 8 CA CAN Cana… Americas 37589262 1.42 1.93e12 51342. 81.5
## 9 MX MEX Mexi… Americas 127575529 1.09 2.63e12 20582. 80.4
## 10 SS SSD S. S… Africa 11062113 0.782 NA NA 19.9
## # … with 2 more variables: internet_access <dbl>, military_exp <dbl>
Transform problematic variables:
# get correlations
# Kendall's tau on complete cases only (several indicators contain NAs)
wb_corr = wb_tib %>% select_if(is.numeric) %>% correlate(method='kendall', use="na.or.complete") %>% stretch()
##
## Correlation method: 'kendall'
## Missing treated using: 'na.or.complete'
# observe high correlations
# |tau| > 0.5 pairs; each pair is listed twice (x/y swapped)
wb_corr %>% filter(r > .5 | r < -.5)
## # A tibble: 8 x 3
## x y r
## <chr> <chr> <dbl>
## 1 population gdp 0.510
## 2 gdp population 0.510
## 3 pc_gdp urban_pop 0.553
## 4 pc_gdp internet_access 0.794
## 5 urban_pop pc_gdp 0.553
## 6 urban_pop internet_access 0.554
## 7 internet_access pc_gdp 0.794
## 8 internet_access urban_pop 0.554
# histograms to see normality
# pivot_longer()/where() replace the superseded gather()/select_if()
ggplot(pivot_longer(wb_tib %>% select(where(is.numeric)), everything()), aes(value)) +
  geom_histogram(bins = 20) +
  facet_wrap(~name, scales = 'free_x')
## Warning: Removed 36 rows containing non-finite values (stat_bin).
The variables whose skewness is a serious concern are population and gdp; these, together with pc_gdp and military_exp, will be normalised with log10.
# transform and scale
# log10(x+1) compresses heavy right tails; +1 guards against log10(0)
wb_trans_tib = wb_tib %>% mutate( gdp = log10(gdp+1),
population = log10(population+1),
pc_gdp = log10(pc_gdp+1),
military_exp = log10(military_exp+1))
# re-check distributions after the transformation
ggplot(gather(wb_trans_tib %>% select_if(is.numeric)), aes(value)) +
geom_histogram(bins = 20) +
facet_wrap(~key, scales = 'free_x')
## Warning: Removed 36 rows containing non-finite values (stat_bin).
Multi-dimensional scaling of countries to show clustering:
# MDS of countries: scale numeric variables, compute distances, project to 2D;
# across(where(...)) replaces the superseded mutate_if()
mds <- wb_trans_tib %>% select(-iso_a2,-name) %>%
  mutate(across(where(is.numeric), scale)) %>%
  dist() %>% cmdscale() %>% as_tibble()
## Warning in dist(.): NAs introduced by coercion
colnames(mds) <- c("Dim1", "Dim2")
# row order is preserved, so labels/regions can be attached directly
mds$name = wb_trans_tib$iso_a3
mds$region_un = wb_trans_tib$region_un
# plot MDS
# countries labelled by ISO3 code and coloured by UN region
ggplot(mds, aes(x=Dim1, y=Dim2, color=region_un)) +
ggtitle("Multi-dimensional scaling of countries") +
#geom_point(size=0) +
geom_text(
label = mds$name,
size = 2,
#nudge_x = .25, nudge_y = .25
#check_overlap = T
) + theme_bw()
# free the intermediate MDS tibble
rm(mds)
# distance metric for country uniqueness — note it is stored in `meth`,
# not `method` (which still holds the value from the London analysis)
#meth = 'mahalanobis'
meth = 'manhattan'
#meth = 'euclidean'
# drop absolute-size variables so uniqueness reflects structural indicators
filt_wb_trans_tib = wb_trans_tib %>% select(-population,-gdp)
wb_uniq = dist_uniq(filt_wb_trans_tib, meth, impute_na = T)
## [1] "dist_uniq 157 9 manhattan"
## [1] "imputing missing values..."
## [1] "scaling data..."
summary(wb_uniq)
## iso_a2 iso_a3 name region_un
## Length:157 Length:157 Length:157 Length:157
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## pop_growth.V1 pc_gdp.V1 urban_pop.V1
## Min. :-2.7301313 Min. :-2.3996204 Min. :-2.1146868
## 1st Qu.:-0.7484615 1st Qu.:-0.7282080 1st Qu.:-0.7710306
## Median : 0.0270864 Median : 0.0835194 Median : 0.0424196
## Mean : 0.0000000 Mean : 0.0000000 Mean : 0.0000000
## 3rd Qu.: 0.7608670 3rd Qu.: 0.8291936 3rd Qu.: 0.8433711
## Max. : 2.7727087 Max. : 1.8291785 Max. : 1.7855400
## internet_access.V1 military_exp.V1 sum_dists sum_dists_dec
## Min. :-1.8469114 Min. :-2.418894 Min. : 669.1 Min. : 1.000
## 1st Qu.:-0.9510277 1st Qu.:-0.531860 1st Qu.: 779.1 1st Qu.: 3.000
## Median : 0.2865862 Median :-0.084567 Median : 854.3 Median : 5.000
## Mean : 0.0000000 Mean : 0.000000 Mean : 885.0 Mean : 5.433
## 3rd Qu.: 0.8885776 3rd Qu.: 0.412638 3rd Qu.: 961.8 3rd Qu.: 8.000
## Max. : 1.4961384 Max. : 3.388462 Max. :1361.3 Max. :10.000
## sum_dists_z sum_dists_z_p sum_dists_class sum_dists_p_thresh
## Min. :-1.4363 Min. :0.0007658 very rare : 2 ***: 2
## 1st Qu.:-0.7043 1st Qu.:0.3046804 rare : 2 ** : 2
## Median :-0.2043 Median :0.5809292 medium : 7 * : 7
## Mean : 0.0000 Mean :0.5255461 common : 7 . : 7
## 3rd Qu.: 0.5110 3rd Qu.:0.7593628 very common:139 :139
## Max. : 3.1686 Max. :0.9245428
## dist_method
## Length:157
## Class :character
## Mode :character
##
##
##
# export rounded results; use `meth` (the metric actually used above) — the
# original referenced `method`, which still held 'euclidean' from the London
# analysis, so the file was mislabelled
openxlsx::write.xlsx(wb_uniq %>% mutate_if(is.numeric, round, 4),
paste0('analysis/wb_countries_uniqueness-',meth,'.xlsx'))
wb_uniq %>% head(20)
## # A tibble: 20 x 16
## iso_a2 iso_a3 name region_un pop_growth[,1] pc_gdp[,1] urban_pop[,1]
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 BI BDI Burundi Africa 1.60 -2.40 -2.11
## 2 NE NER Niger Africa 2.18 -1.98 -1.97
## 3 BH BHR Bahrain Asia 2.77 1.16 1.31
## 4 OM OMN Oman Asia 1.46 0.724 1.13
## 5 KW KWT Kuwait Asia 0.320 1.25 1.79
## 6 SA SAU Saudi Arabia Asia 0.320 1.20 1.07
## 7 MW MWI Malawi Africa 1.17 -2.10 -1.94
## 8 CD COD Dem. Rep. Co… Africa 1.65 -2.07 -0.685
## 9 TD TCD Chad Africa 1.47 -1.75 -1.66
## 10 PG PNG Papua New Gu… Oceania 0.567 -0.873 -2.11
## 11 UG UGA Uganda Africa 1.97 -1.47 -1.62
## 12 MD MDA Moldova Europe -2.73 0.0816 -0.789
## 13 MZ MOZ Mozambique Africa 1.40 -1.94 -1.07
## 14 ET ETH Ethiopia Africa 1.12 -1.46 -1.76
## 15 MG MDG Madagascar Africa 1.18 -1.72 -1.01
## 16 SG SGP Singapore Asia -0.143 1.83 1.79
## 17 IL ISR Israel Asia 0.521 1.06 1.45
## 18 LR LBR Liberia Africa 0.982 -1.84 -0.390
## 19 TG TGO Togo Africa 0.976 -1.75 -0.811
## 20 BF BFA Burkina Faso Africa 1.35 -1.47 -1.36
## # … with 9 more variables: internet_access <dbl[,1]>, military_exp <dbl[,1]>,
## # sum_dists <dbl>, sum_dists_dec <int>, sum_dists_z <dbl>,
## # sum_dists_z_p <dbl>, sum_dists_class <fct>, sum_dists_p_thresh <fct>,
## # dist_method <chr>
Based on https://r-spatial.github.io/sf/articles/sf5.html
# correlations of the uniqueness score with every numeric variable, strongest first
corr_tib = wb_uniq %>% select_if(is.numeric) %>% correlate(method='kendall') %>% stretch() %>% filter(x=='sum_dists') %>% arrange(-r)
##
## Correlation method: 'kendall'
## Missing treated using: 'pairwise.complete.obs'
# plot distributions
# raw sum, z-score, and p-value versions of the uniqueness score
ggplot(gather(wb_uniq %>% select_if(is.numeric) %>%
select(sum_dists, sum_dists_z, sum_dists_z_p)),
aes(value)) +
geom_histogram(bins = 20) +
facet_wrap(~key, scales = 'free_x')
# data for maps: attach geometries back via the ISO3 key
wb_uniq_geo = merge(wb_geo %>% select(iso_a3), wb_uniq, by='iso_a3')
Generate world maps with uniqueness information:
wb_uniq_geo <- st_transform(wb_uniq_geo, crs = crs_wintri)
plot_uniq_map(wb_uniq_geo)
## tmap mode set to plotting
## Variable(s) "sum_dists_z" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
gen_variable_heatmap(wb_uniq_geo)
## # A tibble: 210 x 3
## row_id col_name value[,1]
## <chr> <chr> <dbl>
## 1 Burundi pop_growth 1.60
## 2 Burundi pc_gdp -2.40
## 3 Burundi urban_pop -2.11
## 4 Burundi internet_access -1.76
## 5 Burundi military_exp 0.174
## 6 Burundi sum_dists_z 3.17
## 7 Burundi sum_dists_z_p 0.000766
## 8 Niger pop_growth 2.18
## 9 Niger pc_gdp -1.98
## 10 Niger urban_pop -1.97
## # … with 200 more rows
gen_variable_heatmap(wb_uniq_geo, F)
## # A tibble: 210 x 3
## row_id col_name value[,1]
## <chr> <chr> <dbl>
## 1 Bolivia pop_growth 0.0801
## 2 Bolivia pc_gdp -0.269
## 3 Bolivia urban_pop 0.427
## 4 Bolivia internet_access -0.377
## 5 Bolivia military_exp -0.164
## 6 Bolivia sum_dists_z -1.44
## 7 Bolivia sum_dists_z_p 0.925
## 8 Paraguay pop_growth -0.0373
## 9 Paraguay pc_gdp 0.0570
## 10 Paraguay urban_pop 0.0716
## # … with 200 more rows
European regions at NUTS 2 level (https://ec.europa.eu/eurostat/web/nuts/background).
# load EU data and remove French Guyana for cartography
# (the FRY* NUTS codes are overseas regions that distort the map extent)
eu_geo = read_sf("data/eurostat/eu_vars_with_bounds/eu_nuts_2_eurostat_2018.geojson") %>%
filter(!(NUTS_ID %in% c('FRY3','FRY2','FRY1','FRY4','FRY5')))
summary(eu_geo)
## NUTS_ID id CNTR_CODE NUTS_NAME
## Length:327 Length:327 Length:327 Length:327
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## LEVL_CODE FID geo broadband_access_household
## Min. :2 Length:327 Length:327 Min. : 84.0
## 1st Qu.:2 Class :character Class :character 1st Qu.: 97.0
## Median :2 Mode :character Mode :character Median : 99.0
## Mean :2 Mean : 97.5
## 3rd Qu.:2 3rd Qu.: 99.0
## Max. :2 Max. :100.0
## NA's :152
## disp_income_household gender_employ_gap life_expectancy_f life_expectancy_m
## Min. :12300 Min. : 0.70 Min. :77.30 Min. :70.1
## 1st Qu.:16125 1st Qu.: 7.90 1st Qu.:82.00 1st Qu.:76.4
## Median :17450 Median :10.20 Median :83.50 Median :79.0
## Mean :18742 Mean :13.49 Mean :83.33 Mean :78.2
## 3rd Qu.:20075 3rd Qu.:15.75 3rd Qu.:84.90 3rd Qu.:80.4
## Max. :47000 Max. :48.80 Max. :88.10 Max. :83.0
## NA's :277 NA's :4 NA's :4 NA's :4
## life_expectancy_t region_gdp region_population unemployment_rate_f
## Min. :73.60 Min. : 1365 Min. : 29489 Min. : 1.600
## 1st Qu.:79.30 1st Qu.: 16739 1st Qu.: 940271 1st Qu.: 3.500
## Median :81.50 Median : 34901 Median : 1506232 Median : 5.450
## Mean :80.79 Mean : 53363 Mean : 1889104 Mean : 7.891
## 3rd Qu.:82.60 3rd Qu.: 65104 3rd Qu.: 2310766 3rd Qu.: 9.350
## Max. :85.50 Max. :733875 Max. :15029231 Max. :36.300
## NA's :4 NA's :17 NA's :17
## unemployment_rate_m unemployment_rate_t geometry
## Min. : 0.800 Min. : 1.300 MULTIPOLYGON :327
## 1st Qu.: 3.800 1st Qu.: 3.600 epsg:4326 : 0
## Median : 5.300 Median : 5.400 +proj=long...: 0
## Mean : 6.745 Mean : 7.064
## 3rd Qu.: 8.200 3rd Qu.: 8.675
## Max. :22.900 Max. :29.000
## NA's :14 NA's :5
# project data to ETRS 89 LAEA, suitable for Europe
eu_geo <- eu_geo %>% st_transform(3035)
# get more variables
#eu_tib = read_tsv('data/eurostat/eu_all_geo_data.tsv')
#summary(eu_tib)
# keep key indicators; pc_gdp scales by 1e6 — presumably region_gdp is in
# millions so this yields GDP per person (TODO confirm units in source data)
eu_geo = eu_geo %>% select(NUTS_ID, CNTR_CODE, NUTS_NAME, region_population, life_expectancy_t, region_gdp, unemployment_rate_t) %>% mutate(pc_gdp = region_gdp/region_population*1e6)
eu_geo %>% sample_n(10)
## Simple feature collection with 10 features and 8 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 3377114 ymin: 2143784 xmax: 5375179 ymax: 3390452
## projected CRS: ETRS89-extended / LAEA Europe
## # A tibble: 10 x 9
## NUTS_ID CNTR_CODE NUTS_NAME region_populati… life_expectancy… region_gdp
## * <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 DE23 DE Oberpfalz 1104407 80.9 47273.
## 2 ITC4 IT Lombardia 10036258 84 388065.
## 3 PL43 PL Lubuskie 1004127 76.9 10798.
## 4 RS22 RS Регион Јужне … 1513383 75.3 6058.
## 5 ITH3 IT Veneto 4905037 84 163304.
## 6 PL21 PL Małopolskie 3349498 79.1 40427.
## 7 NL33 NL Zuid-Holland 3681044 81.9 163805.
## 8 FRG0 FR Pays de la Lo… 3772852 83.3 119149.
## 9 ITI2 IT Umbria 884640 84.3 22482.
## 10 DE50 DE Bremen 681032 79.7 33708.
## # … with 3 more variables: unemployment_rate_t <dbl>,
## # geometry <MULTIPOLYGON [m]>, pc_gdp <dbl>
Transform variables:
eu_geo %>% select_if(is.numeric) %>% st_drop_geometry() %>% correlate(method='kendall', use="na.or.complete") %>% stretch() %>% arrange(-r)
##
## Correlation method: 'kendall'
## Missing treated using: 'na.or.complete'
## # A tibble: 25 x 3
## x y r
## <chr> <chr> <dbl>
## 1 region_population region_gdp 0.509
## 2 region_gdp region_population 0.509
## 3 region_gdp pc_gdp 0.472
## 4 pc_gdp region_gdp 0.472
## 5 life_expectancy_t pc_gdp 0.384
## 6 pc_gdp life_expectancy_t 0.384
## 7 life_expectancy_t region_gdp 0.287
## 8 region_gdp life_expectancy_t 0.287
## 9 life_expectancy_t unemployment_rate_t 0.130
## 10 unemployment_rate_t life_expectancy_t 0.130
## # … with 15 more rows
# plot distributions
ggplot(gather(eu_geo %>% st_drop_geometry() %>% select_if(is.numeric)),
aes(value)) +
geom_histogram(bins = 20) +
facet_wrap(~key, scales = 'free_x')
## Warning: Removed 43 rows containing non-finite values (stat_bin).
# transform
# log10(x+1) to reduce right skew; +1 guards against log10(0)
eu_trans_geo = eu_geo %>% mutate(region_population = log10(region_population+1),
region_gdp = log10(region_gdp+1),
pc_gdp = log10(pc_gdp+1),
unemployment_rate_t = log10(unemployment_rate_t+1))
# re-check distributions after the transformation
ggplot(gather(eu_trans_geo %>% st_drop_geometry() %>% select_if(is.numeric)),
aes(value)) +
geom_histogram(bins = 20) +
facet_wrap(~key, scales = 'free_x')
## Warning: Removed 43 rows containing non-finite values (stat_bin).
# select variables for uniqueness
# region_gdp is dropped: it correlates with both region_population and pc_gdp
eu_trans_geo_filt = eu_trans_geo %>% select(-region_gdp)
#method = 'euclidean'
method = 'mahalanobis'
# note: impute_na is left FALSE here, so regions with missing values get no score
eu_uniq = dist_uniq(eu_trans_geo_filt %>% st_drop_geometry(), method)
## [1] "dist_uniq 327 7 mahalanobis"
## [1] "scaling data..."
## Warning in dist_uniq(eu_trans_geo_filt %>% st_drop_geometry(), method):
## distances could not be calculated for 22 cases. Set impute_na to TRUE to impute
## missing values.
# export rounded results; the file name records the metric used
openxlsx::write.xlsx(eu_uniq %>% mutate_if(is.numeric, round, 4),
paste0('analysis/eu_nuts2_uniqueness-',method,'.xlsx'))
eu_uniq %>% head(20) # %>% View(.)
## # A tibble: 20 x 14
## NUTS_ID CNTR_CODE NUTS_NAME region_populatio… life_expectancy_…
## <chr> <chr> <chr> <dbl> <dbl>
## 1 ES63 ES Ciudad Autónoma de Ceu… -3.39 0.287
## 2 ES64 ES Ciudad Autónoma de Mel… -3.39 -0.0341
## 3 UKI3 UK Inner London - West -0.216 1.57
## 4 BE10 BE Région de Bruxelles-Ca… -0.183 0.287
## 5 BG31 BG Северозападен -0.748 -2.88
## 6 TRC2 TR Sanliurfa, Diyarbakir 1.17 -0.555
## 7 FR10 FR Ile-de-France 2.62 1.49
## 8 TRA2 TR Agri, Kars, Igdir, Ard… -0.277 -1.12
## 9 EL41 EL Βόρειο Αιγαίο -2.29 0.848
## 10 TR10 TR Istanbul 2.87 -0.555
## 11 ES61 ES Andalucía 2.17 0.527
## 12 EL53 EL Δυτική Μακεδονία -2.00 0.888
## 13 LV00 LV Latvija 0.388 -2.28
## 14 TR90 TR Trabzon 0.761 -0.0742
## 15 TRB2 TR Van, Mus, Bitlis, Hakk… 0.504 -0.956
## 16 ITC2 IT Valle d'Aosta/Vallée d… -2.91 0.607
## 17 EL54 EL Ήπειρος -1.73 1.13
## 18 TRB1 TR Malatya, Elazig, Bingö… 0.251 -0.275
## 19 LU00 LU Luxembourg -1.02 0.607
## 20 RS22 RS Регион Јужне и Источне… 0.0917 -2.20
## # … with 9 more variables: unemployment_rate_t <dbl[,1]>, pc_gdp <dbl[,1]>,
## # sum_dists <dbl>, sum_dists_dec <int>, sum_dists_z <dbl>,
## # sum_dists_z_p <dbl>, sum_dists_class <fct>, sum_dists_p_thresh <fct>,
## # dist_method <chr>
Maps and heat maps.
# data
# attach geometries back to the uniqueness table and build display names
eu_uniq_geo = merge(eu_geo %>% select(NUTS_ID), eu_uniq, by='NUTS_ID')
eu_uniq_geo$name = paste(eu_uniq_geo$CNTR_CODE, '-', eu_uniq_geo$NUTS_NAME)
# maps
plot_uniq_map(eu_uniq_geo)
## tmap mode set to plotting
## Variable(s) "sum_dists_z" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
# heatmaps
gen_variable_heatmap(eu_uniq_geo)
## # A tibble: 180 x 3
## row_id col_name value[,1]
## <chr> <chr> <dbl>
## 1 ES - Ciudad Autónoma de Ceuta region_population -3.39e+0
## 2 ES - Ciudad Autónoma de Ceuta life_expectancy_t 2.87e-1
## 3 ES - Ciudad Autónoma de Ceuta unemployment_rate_t 2.71e+0
## 4 ES - Ciudad Autónoma de Ceuta pc_gdp -1.40e-1
## 5 ES - Ciudad Autónoma de Ceuta sum_dists_z 5.69e+0
## 6 ES - Ciudad Autónoma de Ceuta sum_dists_z_p 6.25e-9
## 7 ES - Ciudad Autónoma de Melilla region_population -3.39e+0
## 8 ES - Ciudad Autónoma de Melilla life_expectancy_t -3.41e-2
## 9 ES - Ciudad Autónoma de Melilla unemployment_rate_t 2.50e+0
## 10 ES - Ciudad Autónoma de Melilla pc_gdp -2.53e-1
## # … with 170 more rows
gen_variable_heatmap(eu_uniq_geo, F)
## # A tibble: 180 x 3
## row_id col_name value[,1]
## <chr> <chr> <dbl>
## 1 UK - South Yorkshire region_population -0.00487
## 2 UK - South Yorkshire life_expectancy_t -0.235
## 3 UK - South Yorkshire unemployment_rate_t -0.247
## 4 UK - South Yorkshire pc_gdp 0.161
## 5 UK - South Yorkshire sum_dists_z -1.20
## 6 UK - South Yorkshire sum_dists_z_p 0.886
## 7 SI - Vzhodna Slovenija region_population -0.305
## 8 SI - Vzhodna Slovenija life_expectancy_t -0.114
## 9 SI - Vzhodna Slovenija unemployment_rate_t -0.129
## 10 SI - Vzhodna Slovenija pc_gdp -0.278
## # … with 170 more rows
End of notebook